knitr::opts_chunk$set(echo = TRUE)
# Load the four raw transaction files.  Each row is one transaction: a
# transaction ID followed by up to eight purchased product IDs.
# NOTE(review): rm(list = ls()) and setwd(getwd()) were removed -- clearing
# the workspace inside a knitr document is an anti-pattern and setting the
# working directory to itself is a no-op.
name <- c("tr-1k.csv", "tr-5k.csv", "tr-20k.csv", "tr-75k.csv")
df <- vector("list", length(name))
for (i in seq_along(name)) {
  df[[i]] <- read.csv(name[i], header = FALSE, fill = TRUE,
                      col.names = c("ID", "P1", "P2", "P3", "P4",
                                    "P5", "P6", "P7", "P8"))
}
# Product lookup table: numeric ProductID -> human-readable product name.
df.products <- read.csv("products.csv", header = FALSE,
                        col.names = c("ProductID", "Productname"))
# Sanity check: 5000 transactions x 9 columns expected for the 5k file.
dim(df[[2]])
[1] 5000 9
# Skeleton data frames, one per dataset size, each holding only a
# transaction-ID column that the canonical item-name columns will be
# bound to in the next step.
num <- c(1000, 5000, 20000, 75000)
new <- lapply(num, function(n) data.frame(ID = seq_len(n)))
# Sanity check: the 20k skeleton should be 20000 rows x 1 column.
dim(new[[3]])
[1] 20000 1
# Translate every product-ID column of each raw dataset into product names
# via df.products.  NOTE: j starts at 1, so the first column produced is a
# (meaningless) name-lookup of the transaction-ID column; the original code
# removes it again with a second "[, -1]" step further down.
for (i in 1:4){
# match() returns the row index in df.products whose ProductID equals the
# cell value; IDs with no match (and NA cells) become NA.
for (j in 1:ncol(df[[i]])){
product <- df.products$Productname[match(df[[i]][,j], df.products$ProductID)]
new[[i]] <- cbind(new[[i]], product)
}
# Drop the 1:num[i] ID column the skeleton was created with.
new[[i]] <-new[[i]][,-1]
}
# Sanity check: 1000 rows x 9 columns (one still-garbage lookup column
# plus 8 product-name columns) before the second drop below.
dim(new[[1]])
[1] 1000 9
# Drop the leftover first column (the name-lookup of the transaction-ID
# column) so only genuine product-name columns remain.
new <- lapply(new, function(tbl) tbl[, -1])
# File names for the canonical (product-name) version of each dataset.
name_new <- c("tr-1k-canonical.csv", "tr-5k-canonical.csv",
              "tr-20k-canonical.csv", "tr-75k-canonical.csv")
# Write each canonical basket file: comma-separated, no header or row
# names, NA cells written as empty strings so arules can later read the
# rows as variable-length transactions.  seq_along() replaces the
# hard-coded 1:4 so the loop stays correct if a dataset is added/removed.
for (i in seq_along(name_new)) {
  write.table(new[[i]], file = name_new[i], sep = ",",
              row.names = FALSE, col.names = FALSE, na = "")
}
# Association-rule mining packages.
library("arules")
library("arulesViz")
# Read the canonical 1k basket file as an arules transactions object
# (one transaction per line, items comma-separated).
trans <- read.transactions("tr-1k-canonical.csv", sep=",",header = FALSE)
summary(trans)
transactions as itemMatrix in sparse format with
1000 rows (elements/itemsets/transactions) and
50 columns (items) and a density of 0.07076
most frequent items:
Gongolais Cookie Truffle Cake Tuile Cookie Berry Tart Hot Coffee (Other)
108 103 102 95 94 3036
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8
60 162 338 216 132 44 32 16
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 3.00 3.00 3.54 4.00 8.00
includes extended item information - examples:
# Show the first five transactions as item sets.
inspect(trans[1:5])
# Mine frequent itemsets (not rules) at 10% minimum support.
freq.itemset <- apriori(trans, parameter=list(support=0.1, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 100
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [3 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# List the frequent itemsets, most frequent first.
inspect(sort(freq.itemset, decreasing = T, by="count"))
rm(freq.itemset)
# Bar chart of items with support >= 0.1 plus a sparse-matrix view.
itemFrequencyPlot(trans, support = 0.1)
image(trans)
# apriori() with its defaults (support 0.1, confidence 0.8) -- the log
# below shows it writes 0 rules at these thresholds.
rules <- apriori(trans)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 100
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [0 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
rm(rules)
# Relax minimum support to 1% to obtain a useful number of itemsets.
freq.itset <- apriori(trans, parameter=list(support=0.01, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 10
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [132 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# List the 1%-support frequent itemsets, most frequent first.
inspect(sort(freq.itset, decreasing = T, by="count"))
rm(freq.itset)
itemFrequencyPlot(trans, support = 0.01)
image(trans)
# Mine association rules at support 0.01 and confidence 0.5.
rules <- apriori(trans, parameter = list(support=0.01, conf=0.5))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 10
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [124 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Rule count, length distribution, and quality-measure summary.
summary(rules)
set of 124 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
22 69 28 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 3.00 3.00 3.13 4.00 5.00
summary of quality measures:
support confidence coverage lift count
Min. :0.0180 Min. :0.500 Min. :0.0180 Min. : 5.21 Min. :18.0
1st Qu.:0.0190 1st Qu.:0.679 1st Qu.:0.0240 1st Qu.: 9.55 1st Qu.:19.0
Median :0.0275 Median :0.905 Median :0.0300 Median :11.59 Median :27.5
Mean :0.0284 Mean :0.832 Mean :0.0378 Mean :11.17 Mean :28.4
3rd Qu.:0.0320 3rd Qu.:0.974 3rd Qu.:0.0403 3rd Qu.:13.30 3rd Qu.:32.0
Max. :0.0580 Max. :1.000 Max. :0.1080 Max. :19.61 Max. :58.0
mining info:
# Highest-confidence rules.
inspect(head(rules, by="confidence"))
# The same rule set ordered by support and by lift.
rules_support <- sort(rules, by="support")
rules_lift <- sort(rules, by="lift")
inspect(head(rules_support,3))
# Keep only one rule per generating itemset, removing redundant
# rearrangements of the same items.
new.rules <- rules[!duplicated(generatingItemsets(rules))]
rules_support_new <- sort(new.rules, by="support")
rules_lift_new <- sort(new.rules, by="lift")
rules_confi_new <- sort(new.rules, by="confidence")
inspect(head(rules_support_new,3))
inspect(head(rules_confi_new,10))
# Interactive support/confidence scatter plot of the deduplicated rules.
plot(new.rules, engine="htmlwidget")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Top-3 rules by support, transcribed from the inspect() output above.
cat("The top 3 items selected are those which have highest support level:", "\n")
The top 3 items selected are those which have highest support level:
cat("{Truffle Cake} => {Gongolais Cookie}", "\n")
{Truffle Cake} => {Gongolais Cookie}
# Typo fix: the item is "Tuile Cookie" (see the most-frequent-items
# output earlier in the document).
cat("{Marzipan Cookie} => {Tuile Cookie}", "\n")
{Marzipan Cookie} => {Tuile Cookie}
cat("{Strawberry Cake} => {Napoleon Cake}", "\n")
{Strawberry Cake} => {Napoleon Cake}
# Top-3 rules by confidence, transcribed from the inspect() output above.
cat("The top 3 items selected are those which have highest confidence level:", "\n")
The top 3 items selected are those which have highest confidence level:
cat("{Apple Danish, Apple Tart} => {Apple Croissant}", "\n")
{Apple Danish, Apple Tart} => {Apple Croissant}
cat("{Apricot Danish, Opera Cake} => {Cherry Tart}", "\n")
{Apricot Danish, Opera Cake} => {Cherry Tart}
cat("{Apple Danish, Apple Tart, Cherry Soda} => {Apple Croissant}", "\n")
{Apple Danish, Apple Tart, Cherry Soda} => {Apple Croissant}
# NOTE(review): the rule summary above shows a maximum support of 0.058,
# so "support of 0.31" / "0.40" cannot be support values -- they may be
# another quality measure; verify against the inspect() output.
cat("Rule 7 with support of 0.31","\n")
Rule 7 with support of 0.31
cat("{Apple Danish, Apple Tart, Cherry Soda} => {Apple Croissant}", "\n", "\n")
{Apple Danish, Apple Tart, Cherry Soda} => {Apple Croissant}
cat("Rule 9 with support of 0.40","\n")
Rule 9 with support of 0.40
cat("{Apple Danish, Apple Tart} => {Apple Croissant}", "\n")
{Apple Danish, Apple Tart} => {Apple Croissant}
# Re-read all four canonical basket files as arules transactions objects.
trans.name <- c("tr-1k-canonical.csv", "tr-5k-canonical.csv",
                "tr-20k-canonical.csv", "tr-75k-canonical.csv")
trans.list <- lapply(trans.name, function(path) {
  read.transactions(path, sep = ",", header = FALSE)
})
# Frequent itemsets for the 1k dataset at 10% minimum support.
freq_itemset_1k <- apriori(trans.list[[1]], parameter=list(support=0.1, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 100
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [3 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# 1k dataset: itemsets sorted by count, frequency plot, sparse image.
inspect(sort(freq_itemset_1k, decreasing = T, by="count"))
rm(freq_itemset_1k)
itemFrequencyPlot(trans.list[[1]], support = 0.1)
image(trans.list[[1]])
These results are similar to those obtained in the previous section for the 1000-transaction dataset.
# Frequent itemsets for the 5k dataset at the same 10% minimum support.
freq_itemset_5k <- apriori(trans.list[[2]], parameter=list(support=0.1, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 500
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 5000 transaction(s)] done [0.00s].
sorting and recoding items ... [2 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [2 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# 5k dataset: itemsets sorted by count, frequency plot, sparse image.
inspect(sort(freq_itemset_5k, decreasing = T, by="count"))
rm(freq_itemset_5k)
itemFrequencyPlot(trans.list[[2]], support = 0.1)
image(trans.list[[2]])
For the 5000-transaction dataset, with the same minsup value of 0.1, the total number of frequent itemsets is lower than for the 1000-transaction dataset.
# Frequent itemsets for the 20k dataset at 10% minimum support.
freq_itemset_20k <- apriori(trans.list[[3]], parameter=list(support=0.1, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 2000
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 20000 transaction(s)] done [0.01s].
sorting and recoding items ... [2 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [2 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# 20k dataset: itemsets sorted by count, frequency plot, sparse image.
inspect(sort(freq_itemset_20k, decreasing = T, by="count"))
rm(freq_itemset_20k)
itemFrequencyPlot(trans.list[[3]], support = 0.1)
image(trans.list[[3]])
These results for the 20000-transaction dataset are similar to those for the 5000-transaction dataset.
# Frequent itemsets for the 75k dataset at 10% minimum support.
freq_itemset_75k <- apriori(trans.list[[4]], parameter=list(support=0.1, target="frequent itemsets"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 7500
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 75000 transaction(s)] done [0.02s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [3 set(s)] done [0.00s].
creating S4 object ... done [0.01s].
# 75k dataset: itemsets sorted by count, frequency plot, sparse image.
inspect(sort(freq_itemset_75k, decreasing = T, by="count"))
rm(freq_itemset_75k)
itemFrequencyPlot(trans.list[[4]], support = 0.1)
image(trans.list[[4]])
These results for the 75000-transaction dataset are higher than for the 5000 and 20000 datasets, with the exception of Tuile Cookie.
There is a change in the frequent items from the 1000 dataset, and a high quantity of coffee types appears from the 5000 dataset onwards. The number of these frequent items increases in each dataset. Thus customers continue to buy coffee through the 75000 dataset, and the Tuile Cookie is another frequent item.
The most frequent items are:
-Coffee Eclair -Hot Coffee
# Mine rules for the 1k dataset at support 0.01, confidence 0.5
# (same thresholds as the earlier single-dataset section).
rules.1k <- apriori(trans.list[[1]], parameter = list(support=0.01, conf=0.5))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 10
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [124 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Keep one rule per generating itemset, then summarize the survivors.
new.rules.1k <- rules.1k[!duplicated(generatingItemsets(rules.1k))]
summary(new.rules.1k)
set of 47 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
16 23 7 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.00 3.00 2.85 3.00 5.00
summary of quality measures:
support confidence coverage lift count
Min. :0.0180 Min. :0.500 Min. :0.0180 Min. : 5.21 Min. :18.0
1st Qu.:0.0210 1st Qu.:0.558 1st Qu.:0.0235 1st Qu.: 6.60 1st Qu.:21.0
Median :0.0290 Median :0.889 Median :0.0330 Median :10.07 Median :29.0
Mean :0.0311 Mean :0.794 Mean :0.0449 Mean : 9.48 Mean :31.1
3rd Qu.:0.0400 3rd Qu.:0.962 3rd Qu.:0.0735 3rd Qu.:11.78 3rd Qu.:40.0
Max. :0.0580 Max. :1.000 Max. :0.1030 Max. :13.89 Max. :58.0
mining info:
# Deduplicated 1k rules ordered by each quality measure.
new.rules.1k.sort.support <- sort(new.rules.1k, by="support")
new.rules.1k.sort.lift <- sort(new.rules.1k, by="lift")
new.rules.1k.sort.confi <- sort(new.rules.1k, by="confidence")
# Mine rules for the 5k dataset at support 0.01, confidence 0.5.
rules.5k <- apriori(trans.list[[2]], parameter = list(support=0.01, conf=0.5))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 50
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 5000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [115 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Keep one rule per generating itemset, then summarize the survivors.
new.rules.5k <- rules.5k[!duplicated(generatingItemsets(rules.5k))]
summary(new.rules.5k)
set of 42 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
11 23 7 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 2.25 3.00 2.95 3.00 5.00
summary of quality measures:
support confidence coverage lift count
Min. :0.0212 Min. :0.504 Min. :0.0212 Min. : 4.57 Min. :106
1st Qu.:0.0217 1st Qu.:0.631 1st Qu.:0.0233 1st Qu.: 6.53 1st Qu.:109
Median :0.0276 Median :0.915 Median :0.0303 Median :11.57 Median :138
Mean :0.0308 Mean :0.828 Mean :0.0424 Mean :10.52 Mean :154
3rd Qu.:0.0403 3rd Qu.:0.944 3rd Qu.:0.0688 3rd Qu.:13.90 3rd Qu.:202
Max. :0.0512 Max. :1.000 Max. :0.0892 Max. :15.58 Max. :256
mining info:
# Deduplicated 5k rules ordered by each quality measure.
new.rules.5k.sort.support <- sort(new.rules.5k, by="support")
new.rules.5k.sort.lift <- sort(new.rules.5k, by="lift")
new.rules.5k.sort.confi <- sort(new.rules.5k, by="confidence")
# Mine rules for the 20k dataset at support 0.01, confidence 0.5.
rules.20k <- apriori(trans.list[[3]], parameter = list(support=0.01, conf=0.5))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 200
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 20000 transaction(s)] done [0.01s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [114 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Keep one rule per generating itemset, then summarize the survivors.
new.rules.20k <- rules.20k[!duplicated(generatingItemsets(rules.20k))]
summary(new.rules.20k)
set of 41 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
10 23 7 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 3.00 3.00 2.98 3.00 5.00
summary of quality measures:
support confidence coverage lift count
Min. :0.0204 Min. :0.502 Min. :0.0204 Min. : 4.57 Min. : 408
1st Qu.:0.0207 1st Qu.:0.779 1st Qu.:0.0223 1st Qu.: 7.52 1st Qu.: 413
Median :0.0260 Median :0.919 Median :0.0283 Median :12.71 Median : 520
Mean :0.0297 Mean :0.834 Mean :0.0407 Mean :10.60 Mean : 593
3rd Qu.:0.0372 3rd Qu.:0.950 3rd Qu.:0.0437 3rd Qu.:13.47 3rd Qu.: 745
Max. :0.0525 Max. :0.998 Max. :0.0912 Max. :14.58 Max. :1051
mining info:
# Deduplicated 20k rules ordered by each quality measure.
new.rules.20k.sort.support <- sort(new.rules.20k, by="support")
new.rules.20k.sort.lift <- sort(new.rules.20k, by="lift")
new.rules.20k.sort.confi <- sort(new.rules.20k, by="confidence")
# Mine rules for the 75k dataset at support 0.01, confidence 0.5.
rules.75k <- apriori(trans.list[[4]], parameter = list(support=0.01, conf=0.5))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 750
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 75000 transaction(s)] done [0.02s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.02s].
checking subsets of size 1 2 3 4 5 done [0.01s].
writing ... [116 rule(s)] done [0.00s].
creating S4 object ... done [0.01s].
# Remove duplicated rules: keep one rule per generating itemset.
new.rules.75k <- rules.75k[!duplicated(generatingItemsets(rules.75k))]
summary(new.rules.75k)
set of 41 rules
rule length distribution (lhs + rhs):sizes
2 3 4 5
10 23 7 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 3.00 3.00 2.98 3.00 5.00
summary of quality measures:
support confidence coverage lift count
Min. :0.0206 Min. :0.503 Min. :0.0207 Min. : 5.63 Min. :1544
1st Qu.:0.0208 1st Qu.:0.755 1st Qu.:0.0231 1st Qu.: 7.35 1st Qu.:1560
Median :0.0258 Median :0.907 Median :0.0279 Median :13.14 Median :1932
Mean :0.0297 Mean :0.831 Mean :0.0409 Mean :10.72 Mean :2230
3rd Qu.:0.0378 3rd Qu.:0.937 3rd Qu.:0.0435 3rd Qu.:13.50 3rd Qu.:2835
Max. :0.0531 Max. :1.000 Max. :0.0926 Max. :14.73 Max. :3982
mining info:
# Deduplicated 75k rules ordered by each quality measure.
new.rules.75k.sort.support <- sort(new.rules.75k, by="support")
new.rules.75k.sort.lift <- sort(new.rules.75k, by="lift")
new.rules.75k.sort.confi <- sort(new.rules.75k, by="confidence")
# Interactive support/confidence scatter plots, one per dataset size.
plot(new.rules.1k, engine="htmlwidget")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(new.rules.5k, engine="htmlwidget")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(new.rules.20k, engine="htmlwidget")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(new.rules.75k, engine="htmlwidget")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
The results observed are as follows.
The total number of rules hardly varies across dataset sizes: for a minsup of 0.01 and a confidence of 0.5 it ranges from 114 to 124 rules across the four datasets.
CONFIDENCE
In the confidence values the rules are grouped into specific confidence values as the number of transactions increases.
Thus for the 1000 dataset the rules are widely scattered across different confidence values. For the 75000 dataset, by contrast, the rules with the highest lift values are grouped at confidence values between 0.9 and 1, and the rules with the lowest lift values at confidence values between 0.5 and 0.6. This makes sense: the more transactions a dataset contains, the more the confidence that the consequent follows from the antecedent (or not) tends toward a specific value.
SUPPORT LEVEL
In the support level, as mentioned above, as the number of transactions increases, the rules are grouped into specific confidence values and then, it is observed that the values with highest lift are grouped in small support values for a higher number of transactions. In case for 1000 dataset, the support values are widely scattered. For 75000 dataset the support values for the highest lift values are grouped between 0.02 and 0.03 and the support values for lowest lift values between 0.04 and 0.05.
# Consistency fix: use the lower-case "75k" naming used for every other
# object instead of creating a duplicate "75K" variable.
new.rules.75k.sort.support <- sort(new.rules.75k, by="support")
The most frequently purchased item or itemset.
# Highest-support rule = most frequently purchased itemset.
inspect(head(new.rules.75k.sort.support,1))
# Transcribed from the inspect() output above.
cat("{Apricot Danish} => {Cherry Tart}")
{Apricot Danish} => {Cherry Tart}
The least frequently purchased item or item set.
# Lowest-support rule = least frequently purchased itemset.
inspect(tail(new.rules.75k.sort.support,1))
# Typo fix: "Apple Croissant" is capitalised like the other item names
# elsewhere in this document.
cat("{Apple Croissant, Apple Danish, Cherry Soda} => {Apple Tart}")
{Apple Croissant, Apple Danish, Cherry Soda} => {Apple Tart}